In [ ]:
import numpy as np 
import pandas as pd

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import seaborn as sns
In [ ]:
data = pd.read_csv("ai4i2020.csv")
In [ ]:
data.head()
Out[ ]:
UDI Product ID Type Air temperature [K] Process temperature [K] Rotational speed [rpm] Torque [Nm] Tool wear [min] Machine failure TWF HDF PWF OSF RNF
0 1 M14860 M 298.1 308.6 1551 42.8 0 0 0 0 0 0 0
1 2 L47181 L 298.2 308.7 1408 46.3 3 0 0 0 0 0 0
2 3 L47182 L 298.1 308.5 1498 49.4 5 0 0 0 0 0 0
3 4 L47183 L 298.2 308.6 1433 39.5 7 0 0 0 0 0 0
4 5 L47184 L 298.2 308.7 1408 40.0 9 0 0 0 0 0 0
In [ ]:
data.shape
Out[ ]:
(10000, 14)
In [ ]:
from ydata_profiling import ProfileReport

profile = ProfileReport(data, title="Pandas Profiling Report")
In [ ]:
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[ ]:

In [ ]:
data.describe().T
Out[ ]:
count mean std min 25% 50% 75% max
UDI 10000.0 5000.50000 2886.895680 1.0 2500.75 5000.5 7500.25 10000.0
Air temperature [K] 10000.0 300.00493 2.000259 295.3 298.30 300.1 301.50 304.5
Process temperature [K] 10000.0 310.00556 1.483734 305.7 308.80 310.1 311.10 313.8
Rotational speed [rpm] 10000.0 1538.77610 179.284096 1168.0 1423.00 1503.0 1612.00 2886.0
Torque [Nm] 10000.0 39.98691 9.968934 3.8 33.20 40.1 46.80 76.6
Tool wear [min] 10000.0 107.95100 63.654147 0.0 53.00 108.0 162.00 253.0
Machine failure 10000.0 0.03390 0.180981 0.0 0.00 0.0 0.00 1.0
TWF 10000.0 0.00460 0.067671 0.0 0.00 0.0 0.00 1.0
HDF 10000.0 0.01150 0.106625 0.0 0.00 0.0 0.00 1.0
PWF 10000.0 0.00950 0.097009 0.0 0.00 0.0 0.00 1.0
OSF 10000.0 0.00980 0.098514 0.0 0.00 0.0 0.00 1.0
RNF 10000.0 0.00190 0.043550 0.0 0.00 0.0 0.00 1.0
In [ ]:
data.select_dtypes(include=['object']).describe().T
Out[ ]:
count unique top freq
Product ID 10000 10000 M14860 1
Type 10000 3 L 6000
In [ ]:
data.columns
Out[ ]:
Index(['UDI', 'Product ID', 'Type', 'Air temperature [K]',
       'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]',
       'Tool wear [min]', 'Machine failure', 'TWF', 'HDF', 'PWF', 'OSF',
       'RNF'],
      dtype='object')
In [ ]:
num_cols = ['Air temperature [K]', 'Process temperature [K]',
            'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
cat_cols = ['Type', 'Failure type']
label = 'Machine failure'
In [ ]:
data['Air temperature [K]'][data['Air temperature [K]'].isna().astype(int) == 1]
Out[ ]:
Series([], Name: Air temperature [K], dtype: float64)
In [ ]:
plt.figure(figsize=(12, 12))
for i, col in enumerate(num_cols):
    plt.subplot(3, 2, i+1)
    sns.histplot(data, x=col, kde=True, alpha=0.2, color='red', bins=15)
    plt.title(col)
plt.suptitle("Data Distributions", fontsize=15)
plt.tight_layout()
plt.show()
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [ ]:
plt.figure(figsize=(10, 7))
for i, col in enumerate(num_cols):
    plt.subplot(2, 3, i+1)
    sns.rugplot(data, x=col, hue=label, height=0.1)
    sns.boxplot(data, x=col, width=0.25)
plt.suptitle("Data Distributions")
plt.tight_layout()
plt.show()
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
c:\ProgramData\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
No description has been provided for this image
In [ ]:
plt.figure(figsize=(10, 7))
for i, col in enumerate(num_cols):
    plt.subplot(2, 3, i+1)
    sns.boxplot(data, x=label, y=col, width=0.5)
plt.suptitle("Data Distribution in Relation to Machine Failure")
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
#plt.figure(figsize = (6,6))
# , square = True, annot = True, cmap = 'Blues', linewidths = 0.5)
sns.heatmap(data[num_cols].corr(), annot=True, fmt=".2f")
plt.title("Heatmap Analysis")
plt.show()
No description has been provided for this image
In [ ]:
data[num_cols].corr()
Out[ ]:
Air temperature [K] Process temperature [K] Rotational speed [rpm] Torque [Nm] Tool wear [min]
Air temperature [K] 1.000000 0.876107 0.022670 -0.013778 0.013853
Process temperature [K] 0.876107 1.000000 0.019277 -0.014061 0.013488
Rotational speed [rpm] 0.022670 0.019277 1.000000 -0.875027 0.000223
Torque [Nm] -0.013778 -0.014061 -0.875027 1.000000 -0.003093
Tool wear [min] 0.013853 0.013488 0.000223 -0.003093 1.000000
In [ ]:
data.plot.hexbin(x='Air temperature [K]', y='Process temperature [K]',
                 gridsize=20, cmap='Purples', figsize=(5, 4))
plt.title("Hexbin Plot Between Process Temperature and Air Temperature")
plt.show()
No description has been provided for this image
In [ ]:
data.plot.hexbin(x='Rotational speed [rpm]', y='Torque [Nm]',
                 gridsize=30, cmap='Purples', figsize=(5, 4))
plt.title("Hexbin Plot Between Torque and Rotational speed")
plt.show()
No description has been provided for this image
In [ ]:
type_machine_failure = data[['Type', 'Machine failure']].pivot_table(index='Type', columns='Machine failure', aggfunc= lambda x: len(x), margins = True)
print(type_machine_failure)
plt.figure(figsize=(6,6))
sns.heatmap(type_machine_failure, annot=True, fmt='g', cmap='Blues', cbar=False, linewidths=0.5)
plt.title("Type vs Machine Failure")
plt.show()
Machine failure     0    1    All
Type                             
H                 982   21   1003
L                5765  235   6000
M                2914   83   2997
All              9661  339  10000
No description has been provided for this image
In [ ]:
from pycaret.classification import *
s = setup(data, target = 'Machine failure', session_id = 42, data_split_stratify=True)
  Description Value
0 Session id 42
1 Target Machine failure
2 Target type Binary
3 Original data shape (10000, 14)
4 Transformed data shape (10000, 16)
5 Transformed train set shape (7000, 16)
6 Transformed test set shape (3000, 16)
7 Numeric features 11
8 Categorical features 2
9 Preprocess True
10 Imputation type simple
11 Numeric imputation mean
12 Categorical imputation mode
13 Maximum one-hot encoding 25
14 Encoding method None
15 Fold Generator StratifiedKFold
16 Fold Number 10
17 CPU Jobs -1
18 Use GPU False
19 Log Experiment False
20 Experiment Name clf-default-name
21 USI 4ba6
In [ ]:
best_model = compare_models(sort = 'AUC')
Initiated . . . . . . . . . . . . . . . . . . 09:37:04
Status . . . . . . . . . . . . . . . . . . Loading Dependencies
Estimator . . . . . . . . . . . . . . . . . . Compiling Library
In [ ]:
plt.figure(figsize = (4,3))
plot_model(best_model, plot = 'confusion_matrix')
No description has been provided for this image
In [ ]:
plt.figure(figsize = (5,4))
plot_model(best_model, plot = 'auc')
No description has been provided for this image
In [ ]:
plt.figure(figsize = (5, 4))
plot_model(best_model, plot = 'learning')
No description has been provided for this image
In [ ]:
plot_model(best_model, plot = 'feature')
No description has been provided for this image
In [ ]:
save_model(best_model, "ai4i2020_pycaret_model")
Transformation Pipeline and Model Successfully Saved
Out[ ]:
(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['UDI', 'Air temperature [K]',
                                              'Process temperature [K]',
                                              'Rotational speed [rpm]',
                                              'Torque [Nm]', 'Tool wear [min]',
                                              'TWF', 'HDF', 'PWF', 'OSF',
                                              'RNF'],
                                     transformer=SimpleImputer(add_indicator=False,
                                                               copy=True,
                                                               fill_value=None,
                                                               keep_empty_features=False,
                                                               missing_valu...
                  TransformerWrapper(exclude=None, include=None,
                                     transformer=CleanColumnNames(match='[\\]\\[\\,\\{\\}\\"\\:]+'))),
                 ('trained_model',
                  LogisticRegression(C=1.0, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=1000,
                                     multi_class='auto', n_jobs=None,
                                     penalty='l2', random_state=42,
                                     solver='lbfgs', tol=0.0001, verbose=0,
                                     warm_start=False))],
          verbose=False),
 'ai4i2020_pycaret_model.pkl')
In [ ]:
plot_model(best_model, plot = 'calibration')
No description has been provided for this image
In [ ]:
calibrated_model = calibrate_model(best_model)
Initiated . . . . . . . . . . . . . . . . . . 09:38:35
Status . . . . . . . . . . . . . . . . . . Loading Dependencies
Estimator . . . . . . . . . . . . . . . . . . Compiling Library
  Accuracy AUC Recall Prec. F1 Kappa MCC
Fold              
0 0.9971 0.9670 0.9130 1.0000 0.9545 0.9531 0.9541
1 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
2 0.9971 0.9558 0.9130 1.0000 0.9545 0.9531 0.9541
3 0.9986 1.0000 0.9583 1.0000 0.9787 0.9780 0.9782
4 0.9971 0.9581 0.9167 1.0000 0.9565 0.9550 0.9560
5 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
6 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
7 0.9986 1.0000 0.9583 1.0000 0.9787 0.9780 0.9782
8 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000
9 0.9986 0.9767 0.9583 1.0000 0.9787 0.9780 0.9782
Mean 0.9987 0.9858 0.9618 1.0000 0.9802 0.9795 0.9799
Std 0.0012 0.0182 0.0356 0.0000 0.0186 0.0192 0.0188
In [ ]:
plot_model(calibrated_model, plot = 'calibration')
No description has been provided for this image
In [ ]:
automl()
Out[ ]:
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, random_state=42, solver='auto',
                tol=0.0001)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
                max_iter=None, positive=False, random_state=42, solver='auto',
                tol=0.0001)
In [ ]:
#create_app(best_model)